In [1]:
import numpy as np, pandas as pd
from matplotlib import pylab
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault) # 恢复matplotlib默认样式
import seaborn as sns
sns.set_style('whitegrid')
# plt.style.use('ggplot')
# https://tonysyu.github.io/raw_content/matplotlib-style-gallery/gallery.html
import warnings
warnings.filterwarnings('ignore')

matplotlib¶

Comparison between stateful and stateless approach¶

In [3]:
x=np.linspace(0,10,50)
np.random.seed(10)
# stateful
plt.plot(x,np.sin(x)+x+np.random.randn(50)) # randn: standard normal distrubtion
plt.plot(x,np.sin(x)+0.5*x+np.random.randn(50))
plt.plot(x,np.sin(x)+2*x+np.random.rand(50))
plt.title("Three Curves: Stateful (plt.plot)")

plt.show()
In [4]:
fig,ax0=plt.subplots(nrows=1) # Create a figure and a set of subplots.if multiple plots, return an array to ax. 
ax0.plot(x,np.sin(x)+x+np.random.randn(50)) # randn: return standard normal distribution
ax0.plot(x,np.sin(x)+0.5*x+np.random.randn(50))
ax0.plot(x,np.sin(x)+2*x+np.random.rand(50))
ax0.set_title("Three Curves: Stateless (ax.plot)")
plt.show()
In [6]:
fig=plt.figure() # create a new figure
ax0=fig.add_subplot(211) # figure layout: 2x1. We will plot the first chart on [0,0]
ax0.plot(x,np.sin(x)+x+np.random.randn(50))
ax0.plot(x,np.sin(x)+0.5*x+np.random.randn(50))
ax0.plot(x,np.sin(x)+2*x+np.random.rand(50))

ax1=fig.add_subplot(212)
ax1.plot(x,np.sin(x)+x+np.random.randn(50))
ax1.plot(x,np.sin(x)+0.5*x+np.random.randn(50))
ax1.plot(x,np.sin(x)+2*x+np.random.rand(50))
plt.axhline(y=10,color='purple',linestyle='--')

fig.suptitle("Subplot: Stateless Only")
plt.show()
In [9]:
fig=plt.figure()
plt.subplot(211) # two rows and one column, [0,0]
plt.plot(x,np.sin(x)+x+np.random.randn(50))
plt.plot(x,np.sin(x)+0.5*x+np.random.randn(50))
plt.plot(x,np.sin(x)+2*x+np.random.rand(50))

ax0=fig.add_subplot(212) # two rows and one column, [1,0]
ax0.plot(x,np.sin(x)+x+np.random.randn(50))
ax0.plot(x,np.sin(x)+0.5*x+np.random.randn(50))
ax0.plot(x,np.sin(x)+2*x+np.random.rand(50))
plt.axhline(y=10,color='purple',linestyle='--') # 可以放其他数值,比如平均数,中位数之类的。

fig.suptitle("Subplot: Stateful & Stateless")
plt.show()

Adjust Coordinate¶

In [11]:
x=np.linspace(0,10,50)
np.random.seed(10)
fig,ax0=plt.subplots(nrows=1)
ax0.plot(x,np.sin(x)+x+np.random.randn(50))
ax0.plot(x,np.sin(x)+0.5*x+np.random.randn(50))
ax0.plot(x,np.sin(x)+2*x+np.random.rand(50))
ax0.set_title("Three curves",fontsize=20)

# 1. adjust gridline type: dotted-line
ax0.grid(color='gray', alpha=0.5, linestyle='dotted') # alpha to adjust grid transparency
# or hide the grid: 
# ax0.grid(False)
# 2. set x-axis label and y-axis label
ax0.set_xlabel('X')
ax0.set_ylabel('Randomization')
ax0.xaxis.label.set_size(15) # set xlabel size
ax0.yaxis.label.set_size(15) # set xlabel size
# # 3. adjust x-axis and y-axis data range
ax0.set_xticks(np.arange(min(x),max(x)+1,1)) # list of locations
# sns.despine(ax=ax0,left=True,bottom=True) # remove the left and bottom frame 
plt.show() # used to suppress 

For Loop for Subplot¶

In [12]:
x = np.linspace(0,10)
np.random.seed(10)

# 生成数据
y1 = np.sin(x)+x+np.random.randn(50)
y2 = np.sin(x)+0.5*x+np.random.randn(50)
y3 = np.sin(x)+2*x+np.random.randn(50)

df = pd.DataFrame({'serie1':y1,'serie2':y2,'serie3':y3})

fig = plt.figure()
fig.subplots_adjust(hspace=0.4)

i=1
for col in df.columns:
    plt.subplot(df.shape[1],1,i) 
    plt.plot(df.loc[:,col])
    plt.title(col,y=0.6,loc='right')
    i+=1
  
plt.show()

Case¶

In [13]:
df = pd.read_csv("UK-Bank-Customers.csv")
df.head()
Out[13]:
Customer ID Name Surname Gender Age Region Job Classification Date Joined Balance
0 100000001 Simon Walsh Male 21 England White Collar 05.Jan.15 113810.15
1 400000002 Jasmine Miller Female 34 Northern Ireland Blue Collar 06.Jan.15 36919.73
2 100000003 Liam Brown Male 46 England White Collar 07.Jan.15 101536.83
3 300000004 Trevor Parr Male 32 Wales White Collar 08.Jan.15 1421.52
4 100000005 Deirdre Pullman Female 38 England Blue Collar 09.Jan.15 35639.79
In [14]:
df.columns = ['cust_id','first_name','last_name','gender','age','region','job','date_join', 'balance']
df['age_group'] = pd.cut(df['age'], bins=[15, 30, 50, float('Inf')], labels=['15-30', '30-50', 'Above 50'])
df.head()
Out[14]:
cust_id first_name last_name gender age region job date_join balance age_group
0 100000001 Simon Walsh Male 21 England White Collar 05.Jan.15 113810.15 15-30
1 400000002 Jasmine Miller Female 34 Northern Ireland Blue Collar 06.Jan.15 36919.73 30-50
2 100000003 Liam Brown Male 46 England White Collar 07.Jan.15 101536.83 30-50
3 300000004 Trevor Parr Male 32 Wales White Collar 08.Jan.15 1421.52 30-50
4 100000005 Deirdre Pullman Female 38 England Blue Collar 09.Jan.15 35639.79 30-50
In [18]:
dt_region_mean_bal = df.groupby('region', as_index=False).agg({'balance': 'mean'})
dt_region_mean_bal['balance'] = np.rint(dt_region_mean_bal['balance'])
dt_region_mean_bal
Out[18]:
region balance
0 England 39293.0
1 Northern Ireland 39505.0
2 Scotland 39511.0
3 Wales 42390.0
In [19]:
fig,ax0 = plt.subplots(nrows=1)
ax0.bar(dt_region_mean_bal['region'],dt_region_mean_bal['balance'])
# adjust: xlabel, ylabel,y-axis scale; adjust background color
# add title
ax0.set_title("Balance by region",fontsize=20)
ax0.set_xlabel('Region')
ax0.set_ylabel('Average balance')
ax0.xaxis.label.set_size(20) # set xlabel size
ax0.yaxis.label.set_size(20) # set xlabel size
ax0.set_ylim(top=70000)
ax0.grid(False)

# what about data label?
def autolabel(rects):
    """
    Attach a text label above each bar displaying its height
    """
    for rect in rects:
        height = rect.get_height()
        ax0.text(rect.get_x() + rect.get_width()/2., 1.05*height,
                '%d' % int(height),
                ha='center', va='bottom') #ha=horizontal alignment

rect1=ax0.bar(dt_region_mean_bal['region'],dt_region_mean_bal['balance'],color='blue')
autolabel(rect1)
plt.show()
In [20]:
# Balance vs ( Region + Gender): Bar chart with multiple X
dt_rg_mean=df.groupby(['region','gender'])['balance'].mean().reset_index()
dt_rg_mean['balance']=np.rint(dt_rg_mean['balance'])
dt_rg_mean
Out[20]:
region gender balance
0 England Female 39989.0
1 England Male 38582.0
2 Northern Ireland Female 38769.0
3 Northern Ireland Male 41644.0
4 Scotland Female 37306.0
5 Scotland Male 40385.0
6 Wales Female 40312.0
7 Wales Male 44852.0
In [24]:
df.groupby(['region','gender'])['balance'].mean().unstack()
Out[24]:
gender Female Male
region
England 39988.979505 38581.854270
Northern Ireland 38769.423567 41643.831296
Scotland 37306.419060 40385.072099
Wales 40312.093085 44852.180714
In [25]:
# use OOP + pandas plot
fig, ax1 = plt.subplots(nrows=1)
df.groupby(['region','gender'])['balance'].mean().unstack().plot(kind='bar',ax=ax1)
ax1.set_ylim(top=70000)
ax1.set_title("Balance by region and gender")
ax1.set_ylabel('Average Balance')
ax1.grid(False)
plt.show()
In [26]:
# without unstack()
fig,ax1=plt.subplots(nrows=1)
df.groupby(['region','gender'])['balance'].mean().plot(kind='bar',ax=ax1)
ax1.set_ylim(top=70000)
ax1.set_title("Balance by region and gender")
ax1.set_ylabel('Average Balance')
ax1.grid(False)
plt.show()

Dual y-axis¶

In [28]:
# two y-axies: example, plot conversions and conversion rate on the same chart.
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
df_ts = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=list('ABCD'))
df_ts = df_ts.cumsum() # reduce the randomness of the data in order to show dummy trend
df_ts.head()
Out[28]:
A B C D
2000-01-01 0.739637 2.389500 -0.990517 0.799171
2000-01-02 -0.443838 1.018943 -1.359769 -0.260771
2000-01-03 0.563243 1.176798 0.260361 0.940663
2000-01-04 -0.533139 -0.308924 -0.282023 1.899379
2000-01-05 -0.521157 0.688589 0.153674 2.948773
In [29]:
df_ts.plot(secondary_y=['C', 'D'], mark_right=True)
plt.show()
In [33]:
# Another way for dual y-axis
fig=plt.figure()
ax0=fig.add_subplot(111)
df_ts.plot(use_index=True, y=['A','B'],ax=ax0)
ax1=ax0.twinx()
df_ts.plot(use_index=True, y=['C','D'], ax=ax1) # You can then setup color and legend place respectively
plt.show()

Seaborn¶

In [34]:
df_iris = pd.read_csv('iris.csv') # a dataframes
df_iris.head()
Out[34]:
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa

box plot¶

In [35]:
import seaborn as sns
sns.set_style('whitegrid')
#stateful
sns.boxplot(x='species',y='sepal_length',data=df_iris)
plt.xticks(rotation=-45) # adjust xticks
plt.title('Aris species sepal_length boxplox') # add title
plt.show()
In [36]:
# boxplot UK bank client balance by age group, using seaborn and matplotlib
# stateless(OOP)
fig,ax2=plt.subplots(nrows=1)
sns.boxplot(x='age_group',y='balance',data=df,ax=ax2) # connect sns and matplotlib
ax2.grid(False)
ax2.set_title("Balance boxplot sliced by age_group")
plt.show()

dist plot¶

In [39]:
# Balance distribution by difference groups
# stateless(OOP)
fig = plt.figure()
ax3 = fig.add_subplot(1,1,1)

sns.distplot(df.loc[df['age_group']=='15-30','balance'],label='15-30', hist=False,ax=ax3)
sns.distplot(df.loc[df['age_group']=='30-50','balance'],label='30-50', hist=False,ax=ax3)
sns.distplot(df.loc[df['age_group']=='Above 50','balance'],label='Above 50', hist=False,ax=ax3)

ax3.grid(False)
ax3.set_title("Balance distribution by age_group")
ax3.legend()
plt.show()

bar plot¶

In [45]:
# barplot: return as object
df_titanic=sns.load_dataset('titanic')
g=sns.barplot(x="sex", y="survived", hue="class", ci=None,data=df_titanic)
g.set_ylabel('survival rate')
g.set_yticklabels(['{:3.2f}%'.format(x*100) for x in g.get_yticks()]) # if you want to show percentage for yticklabels
plt.show()

scatter plot¶

In [46]:
sns.pairplot(data=df_iris, hue="species") # feature correlation 
plt.show()

Multiple Charts¶

In [54]:
df_iris.head()
Out[54]:
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
In [56]:
fig=plt.figure(figsize=(8,16)) # X: control width, Y: control length
plt.subplot(2,1,1) # or ax0=fig.add_subplot(2,1,1)
sns.boxplot(data=df_iris, x='species',y='sepal_length')
plt.xticks(rotation=-45) # adjust xticks
plt.title('Iris species sepal_length boxplox') # add title

plt.subplot(2,1,2)
sns.distplot(df_iris['sepal_length'])

plt.show()

factor plot¶

In [57]:
df_factor = pd.read_csv('factor.csv')
df_factor=df_factor.iloc[:,[1,2,3,4,5]]
df_factor.head()
Out[57]:
id diet pulse time kind
0 1 low fat 85 1 min rest
1 1 low fat 85 15 min rest
2 1 low fat 88 30 min rest
3 2 low fat 90 1 min rest
4 2 low fat 92 15 min rest
In [61]:
# g is a sns object
g=sns.catplot(x='time', 
              y='pulse', 
              data=df_factor, 
              hue='diet',  # Color by diet
              col='diet',  # Separate by diet
              kind='box') # Swarmplot
 
# Rotate x-axis labels
g.set_xticklabels(rotation=-45)
plt.show()

regression plot¶

In [62]:
x = np.linspace(1,50,num=100)
epsilon = np.random.normal(0,3,size=100)
dt_lin = pd.DataFrame({'x':x, 'y':0.2+0.2*x + epsilon})

sns.set_style('whitegrid')
sns.regplot(x='x',y='y',data=dt_lin)

plt.show()
In [63]:
fig=plt.figure(figsize=(8,4))
ax0=fig.add_subplot(121)
sns.regplot(x='x',y='y',data=dt_lin,ax=ax0)

ax1=fig.add_subplot(122)
sns.regplot(x='x',y='y',data=dt_lin,ax=ax1)
ax1.grid(False)
ax1.set_title('Clean linear regression')

plt.show()
In [72]:
x = np.linspace(1,50, num = 100)
dt_poly=pd.DataFrame({'x':x,'y':0.2+0.3*np.power(x,2)})
sns.regplot(x='x', y='y',data=dt_poly,order=2, ci=None, scatter_kws={"s": 20, 'color': 'r'});
plt.show()

Case¶

In [73]:
df_network = pd.read_csv('phone_data.csv')
df_network.head()
Out[73]:
index date duration item month network network_type
0 0 15/10/14 06:58 34.429 data 2014-11 data data
1 1 15/10/14 06:58 13.000 call 2014-11 Vodafone mobile
2 2 15/10/14 14:46 23.000 call 2014-11 Meteor mobile
3 3 15/10/14 14:48 4.000 call 2014-11 Tesco mobile
4 4 15/10/14 17:27 4.000 call 2014-11 Tesco mobile
In [74]:
df_network['date'] = pd.to_datetime(df_network['date'],format="%d/%m/%y %H:%M")
# df_network['date'] = df_network['date'].dt.date
# df_network['date'] = pd.to_datetime(df_network['date'],format="%Y-%m-%d")
df_network_agg = df_network.query("item in ('data','call')").groupby(['date','network','item'])['duration'].mean().reset_index()
df_network_agg.head()
Out[74]:
date network item duration
0 2014-10-15 06:58:00 Vodafone call 13.000
1 2014-10-15 06:58:00 data data 34.429
2 2014-10-15 14:46:00 Meteor call 23.000
3 2014-10-15 14:48:00 Tesco call 4.000
4 2014-10-15 17:27:00 Tesco call 4.000
In [77]:
# 7 days moving average of duration
df_network_agg = df_network_agg.sort_values(by=['date','network','item'])
df_network_agg_mavg_temp = df_network_agg.groupby(['network','item'])['duration'].transform(lambda x:x.rolling(7,1).mean().round(0))
df_network_agg_mavg_temp.head()
Out[77]:
0    13.0
1    34.0
2    23.0
3     4.0
4     4.0
Name: duration, dtype: float64
In [78]:
df_network_agg_mavg_temp = pd.merge(df_network_agg,df_network_agg_mavg_temp,how='inner',left_index=True,
                                    right_index=True,suffixes=['','_7davg'])
df_network_agg_mavg_temp.fillna(0,inplace=True)
df_network_agg_mavg_temp.head()
Out[78]:
date network item duration duration_7davg
0 2014-10-15 06:58:00 Vodafone call 13.000 13.0
1 2014-10-15 06:58:00 data data 34.429 34.0
2 2014-10-15 14:46:00 Meteor call 23.000 23.0
3 2014-10-15 14:48:00 Tesco call 4.000 4.0
4 2014-10-15 17:27:00 Tesco call 4.000 4.0
In [81]:
# plot a linechart, using seaborn, specify the hue and hue_order, line_style
network_order = df_network_agg_mavg_temp.query("date=='2014-11-01'").sort_values(by=['duration_7davg'])['network'].tolist()
network_order = df_network_agg_mavg_temp['network'].unique().tolist()

# specify colors, using palette
color_shaded_blue=['#1967D2','#1A73E8','#4285F4','#8AB4F8','#AECBFA','#D2E3FC','#E8F0FE'] # 蓝色由深到浅

fig,ax0 = plt.subplots(1,1,figsize=(14,10))

sns.lineplot(x='date',y='duration_7davg',hue='network',hue_order=network_order,
             style='network',
             style_order=network_order,
             palette=color_shaded_blue,
             data=df_network_agg_mavg_temp,ax=ax0,ci=False)
ax0.grid(False)
ax0.legend(loc=2,bbox_to_anchor=(1,1),prop={'size':15},frameon=False)
ax0.set_xlabel('')
ax0.set_ylabel('',fontsize=16)
ax0.set_title('Test title', fontdict={'fontsize':18,'fontweight':'medium'})
for label in (ax0.get_xticklabels()+ax0.get_yticklabels()):
    label.set_fontsize(13)
sns.despine() # remove top and right spines
ax0.spines['left'].set_visible(False) # remove left spine
ax0.spines['bottom'].set_visible(False) # remove right spine
plt.show()

Plotly¶

In [84]:
import plotly
import chart_studio.plotly as py
import plotly.graph_objects as go
import plotly.express as px
In [85]:
# Use British Bank data as example: 
dt_region_job_nums = df.groupby(['region','job'])['cust_id'].count().reset_index()
nodes=list(set(dt_region_job_nums['region'].values.tolist() + dt_region_job_nums['job'].values.tolist()))
dt_region_job_nums['indice_region'] = dt_region_job_nums['region'].apply(lambda x: nodes.index(x))
dt_region_job_nums['indice_job'] = dt_region_job_nums['job'].apply(lambda x: nodes.index(x))

dt_region_job_nums.head()
Out[85]:
region job cust_id indice_region indice_job
0 England Blue Collar 344 2 0
1 England Other 314 2 1
2 England White Collar 1501 2 4
3 Northern Ireland Blue Collar 41 3 0
4 Northern Ireland Other 105 3 1
In [86]:
import plotly.graph_objects as go

fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label = nodes,
      color = "blue"
    ),
    link = dict(
      source = dt_region_job_nums['indice_region'], # indices correspond to labels
      target = dt_region_job_nums['indice_job'], # indices correspond to labels
      value = dt_region_job_nums['cust_id']
  ))])

fig.update_layout(title_text="Customer region and job sankey chart", font_size=10)
fig.show()

# more details of Sankey chart: https://plotly.com/python/sankey-diagram/
In [87]:
# Plotly express
import plotly.express as px
my_template = 'plotly_dark'

df = px.data.gapminder()
df_2007 = df.query("year==2007")
fig = px.scatter(df_2007, x="gdpPercap", y="lifeExp", 
                 size = 'pop', 
                 hover_data=['country'], # df_2007.columns
                 color="continent",
                 title=f"""Life exp in '{my_template}'""", 
                 log_x=True,size_max=60)
fig.update_layout(xaxis={"title":'gdpPercap',
                         "title_font":dict(size=15)},
                  yaxis={"title":'lifeExp',
                         "title_font":dict(size=15)
                  },
                  title_font_color='white',
                  title_x=0.5,
                  template=my_template 
                  )

fig.show()
In [88]:
df = px.data.gapminder().query("year == 2007")
fig = px.sunburst(df, path=['continent', 'country'], values='pop',
                  color='lifeExp', hover_data=['iso_alpha'])
fig.update_layout(template=my_template)
fig.show()

Altair¶

In [93]:
import altair as alt
from vega_datasets import data
cars = data.cars()

alt.Chart(cars).mark_point().encode(
    x='Horsepower',
    y='Miles_per_Gallon',
    color='Origin',
).interactive()
Out[93]: